In [ ]:
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# DISCRIMINANT ANALYSIS – LDA and QDA
# Import necessary libraries
! pip install pandas numpy scikit-learn ISLP
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score
In [3]:
# Load the dataset either from the website
# url = "http://fs2.american.edu/baron/www/627/R/Data_from_the_textbook/Auto.csv"
# Auto = pd.read_csv(url)
# or from the ISLP library
from ISLP import load_data
Auto = load_data('Auto')
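In [ ]:
# Optional sanity check (not part of the original analysis): confirm the
# number of rows/columns and preview the first few records after loading.
print(Auto.shape)
print(Auto.head())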
In [4]:
# Define ECO ratings based on 'mpg'
Auto['mpg'].describe()
Out[4]:
count    392.000000
mean      23.445918
std        7.805007
min        9.000000
25%       17.000000
50%       22.750000
75%       29.000000
max       46.600000
Name: mpg, dtype: float64
In [5]:
Auto['ECO'] = pd.cut(Auto['mpg'], bins=[-np.inf, 17.5, 23, 29, np.inf],
labels=['Heavy', 'OK', 'Economy', 'Excellent'])
# Check the distribution of ECO
print(Auto['ECO'].value_counts())
ECO
Heavy        104
OK           101
Excellent     95
Economy       92
Name: count, dtype: int64
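In [ ]:
# The bin edges above (17.5, 23, 29) roughly track the mpg quartiles reported
# by describe(). An alternative sketch, not used below: pd.qcut derives
# equal-frequency bins directly from the data, giving nearly balanced classes.
eco_q = pd.qcut(Auto['mpg'], q=4, labels=['Heavy', 'OK', 'Economy', 'Excellent'])
print(eco_q.value_counts())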
In [6]:
# pd.cut already returns a categorical column; this cast just makes it explicit
Auto['ECO'] = Auto['ECO'].astype('category')
# Prepare feature set (acceleration, year, horsepower, weight) and target (ECO)
X = Auto[['acceleration', 'year', 'horsepower', 'weight']]
y = Auto['ECO']
In [7]:
X.dtypes
Out[7]:
acceleration    float64
year              int64
horsepower        int64
weight            int64
dtype: object
In [8]:
# Note: in the raw Auto.csv, 'horsepower' uses '?' for missing values. The ISLP
# version loaded above is already numeric (see dtypes), so this replacement is
# a no-op here; it is only needed when loading from the CSV.
Auto['horsepower'] = Auto['horsepower'].replace('?', np.nan)
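In [ ]:
# If loading from the CSV instead (a hypothetical alternative route via the
# commented-out url above), a minimal sketch for handling '?' at read time:
# let read_csv treat '?' as NaN, then drop the affected rows.
# Auto = pd.read_csv(url, na_values='?')
# Auto = Auto.dropna(subset=['horsepower'])
# Auto['horsepower'] = Auto['horsepower'].astype(int)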
In [9]:
# Linear Discriminant Analysis (LDA)
lda = LDA()
lda.fit(X, y)
# Predict using the trained LDA model
y_pred_lda = lda.predict(X)
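In [ ]:
# A quick look inside the fitted model (a sketch using standard scikit-learn
# LDA attributes): estimated class priors and per-class mean vectors.
print("Classes:", lda.classes_)
print("Estimated priors:", lda.priors_)
print("Class means:")
print(pd.DataFrame(lda.means_, index=lda.classes_, columns=X.columns))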
In [10]:
# Confusion matrix and accuracy for LDA
conf_matrix_lda = confusion_matrix(y, y_pred_lda)
accuracy_lda = accuracy_score(y, y_pred_lda)
In [11]:
print("LDA Confusion Matrix:")
print(conf_matrix_lda)
print("LDA Accuracy: {:.4f}".format(accuracy_lda))
LDA Confusion Matrix:
[[64 17  0 11]
 [12 81  0  2]
 [ 0  0 90 14]
 [21  1  4 75]]
LDA Accuracy: 0.7908
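In [ ]:
# Reading the confusion matrix: rows are true classes, columns are predicted
# classes, both in lda.classes_ order. A sketch computing per-class recall
# as the diagonal divided by each row total.
per_class_recall = np.diag(conf_matrix_lda) / conf_matrix_lda.sum(axis=1)
print(pd.Series(per_class_recall, index=lda.classes_))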
In [12]:
# LDA with specified prior probabilities
lda_prior = LDA(priors=[0.25, 0.25, 0.25, 0.25])
y_pred_lda_prior = cross_val_predict(lda_prior, X, y, cv=5)
lda_prior.fit(X, y)  # also fit on the full data; cross_val_predict above uses its own internal fits
# Confusion matrix and accuracy with specified priors
conf_matrix_lda_prior = confusion_matrix(y, y_pred_lda_prior)
accuracy_lda_prior = accuracy_score(y, y_pred_lda_prior)
print("LDA with Prior Confusion Matrix:")
print(conf_matrix_lda_prior)
print("LDA with Prior Accuracy: {:.4f}".format(accuracy_lda_prior))
LDA with Prior Confusion Matrix:
[[54 23  0 15]
 [20 73  0  2]
 [ 0  0 88 16]
 [22  3  4 72]]
LDA with Prior Accuracy: 0.7321
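In [ ]:
# Caution when comparing: the 0.7908 above is resubstitution (training-set)
# accuracy, while the 0.7321 here is 5-fold cross-validated. A sketch putting
# the default-prior LDA on the same cross-validated footing before comparing.
y_pred_lda_cv = cross_val_predict(LDA(), X, y, cv=5)
print("LDA (default priors) CV Accuracy: {:.4f}".format(accuracy_score(y, y_pred_lda_cv)))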
In [13]:
# Quadratic Discriminant Analysis (QDA)
qda = QDA(priors=[0.25, 0.25, 0.25, 0.25])
y_pred_qda = cross_val_predict(qda, X, y, cv=5)
qda.fit(X, y)
# Confusion matrix and accuracy for QDA
conf_matrix_qda = confusion_matrix(y, y_pred_qda)
accuracy_qda = accuracy_score(y, y_pred_qda)
print("QDA Confusion Matrix:")
print(conf_matrix_qda)
print("QDA Accuracy: {:.4f}".format(accuracy_qda))
QDA Confusion Matrix:
[[51 21  0 20]
 [25 68  0  2]
 [ 0  0 86 18]
 [23  0  4 74]]
QDA Accuracy: 0.7117
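In [ ]:
# LDA assumes one covariance matrix shared by all classes; QDA estimates a
# separate covariance per class. A rough diagnostic sketch (an added check,
# not from the original analysis): compare the spread of one feature across
# classes to see how far the equal-covariance assumption is stretched.
print(Auto.groupby('ECO', observed=True)['weight'].std())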
In [14]:
# Posterior probabilities for first 20 observations
lda_posterior = lda.predict_proba(X[:20])
print("LDA Posterior Probabilities (first 20 rows):")
print(pd.DataFrame(lda_posterior, columns=lda.classes_))
# Summing the posterior probabilities for each row (should add up to 1)
print("Row sums of posterior probabilities:")
print(np.sum(lda_posterior, axis=1))
LDA Posterior Probabilities (first 20 rows):
         Economy     Excellent         Heavy        OK
0   7.168583e-04  6.070189e-07  8.693823e-01  0.129900
1   3.912607e-05  1.434172e-08  9.914524e-01  0.008508
2   8.622131e-04  7.950708e-07  9.215251e-01  0.077612
3   1.030914e-03  8.918585e-07  9.157284e-01  0.083240
4   8.514916e-04  8.599892e-07  8.914880e-01  0.107660
5   9.967844e-09  4.559089e-13  9.999710e-01  0.000029
6   4.349652e-09  1.680365e-13  9.999910e-01  0.000009
7   7.284857e-09  3.453575e-13  9.999857e-01  0.000014
8   2.024389e-09  5.648859e-14  9.999952e-01  0.000005
9   2.355882e-06  5.395486e-10  9.991637e-01  0.000834
10  1.178452e-04  6.711477e-08  9.860518e-01  0.013830
11  6.617738e-05  4.181339e-08  9.851480e-01  0.014786
12  1.936976e-05  7.738634e-09  9.899570e-01  0.010024
13  6.428759e-03  8.812579e-06  9.732992e-01  0.020263
14  4.651891e-01  1.178638e-02  4.783595e-04  0.522546
15  8.572947e-02  5.480650e-04  1.428626e-02  0.899436
16  1.165114e-01  8.680732e-04  1.052362e-02  0.872097
17  2.107162e-01  2.914475e-03  1.625043e-03  0.784744
18  6.784941e-01  3.844641e-02  4.224588e-05  0.283017
19  7.864024e-01  1.003046e-01  2.624296e-07  0.113293
Row sums of posterior probabilities:
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
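In [ ]:
# The predicted label is simply the class with the largest posterior
# probability. A sketch verifying this for the first 20 rows against y_pred_lda.
argmax_labels = lda.classes_[np.argmax(lda_posterior, axis=1)]
print(argmax_labels)
print(np.all(argmax_labels == y_pred_lda[:20]))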